<?php

set_time_limit(0);

//=================================工具函数=====================
/**
 * Collapse "." and ".." segments in a URL path.
 *
 * @param string $path Path component, e.g. "/a/b/../c".
 * @return string The path with dot segments removed, e.g. "/a/c".
 */
function real_url_remove_dot_segments($path) {
	$segments = explode('/', $path);
	$last = count($segments) - 1;
	$out = array();
	foreach ($segments as $i => $segment) {
		if ($segment === '.' || $segment === '..') {
			// ".." drops the previous segment, but never the leading "" that
			// represents the root slash of an absolute path.
			if ($segment === '..' && (count($out) > 1 || ($out && $out[0] !== ''))) {
				array_pop($out);
			}
			// A trailing dot segment still denotes a directory: keep the final "/".
			if ($i === $last) {
				$out[] = '';
			}
			continue;
		}
		$out[] = $segment;
	}
	return implode('/', $out);
}

/**
 * Resolve a (possibly relative) URL against a base URL.
 *
 * Handles absolute URLs, protocol-relative URLs ("//host/path"),
 * absolute paths ("/path"), relative paths ("dir/page", "../page") and
 * query-only references ("?p=2"). The query string of the base URL is
 * inherited only when the reference has no path of its own. "?" and "#"
 * are emitted only when the corresponding component is non-empty.
 * User/password components are intentionally not emitted, as before.
 *
 * @param string $current_url URL or relative reference to resolve.
 * @param string $base_url    URL the reference is relative to.
 * @return string The resolved URL.
 */
function real_url($current_url, $base_url='') {
	// parse_url() returns false for seriously malformed input; treat that as "no components".
	$current = parse_url((string) $current_url);
	$base = parse_url((string) $base_url);
	if ( ! is_array($current)) {
		$current = array();
	}
	if ( ! is_array($base)) {
		$base = array();
	}
	$part = function ($parts, $key, $default = '') {
		return (isset($parts[$key]) && $parts[$key] !== '') ? $parts[$key] : $default;
	};

	if ($part($current, 'scheme') !== '') {
		// Already absolute: use it as-is.
		$target = $current;
	} elseif ($part($current, 'host') !== '') {
		// Protocol-relative reference: only the scheme comes from the base.
		$target = $current;
		$target['scheme'] = $part($base, 'scheme', 'http');
	} else {
		$target = array(
			'scheme' => $part($base, 'scheme', 'http'),
			'host' => $part($base, 'host'),
			'port' => $part($base, 'port', null),
		);
		$relative_path = $part($current, 'path');
		$base_path = $part($base, 'path');
		if ($relative_path === '') {
			// Same document: keep the base path; the base query is used only
			// when the reference does not bring its own.
			$target['path'] = $base_path;
			$target['query'] = isset($current['query']) ? $current['query'] : $part($base, 'query');
		} else {
			if ($relative_path[0] !== '/') {
				// Relative path: resolve against the directory of the base path.
				$slash = strrpos($base_path, '/');
				$base_dir = ($slash === false) ? '' : substr($base_path, 0, $slash + 1);
				if ($base_dir === '' && $target['host'] !== '') {
					$base_dir = '/';
				}
				$relative_path = $base_dir . $relative_path;
			}
			$target['path'] = real_url_remove_dot_segments($relative_path);
			$target['query'] = $part($current, 'query');
		}
		$target['fragment'] = $part($current, 'fragment');
	}

	$scheme = $part($target, 'scheme', 'http');
	$url = $scheme . '://' . $part($target, 'host');

	// Keep an explicit port unless it is the default one for the scheme.
	$default_ports = array('http' => 80, 'https' => 443);
	$port = $part($target, 'port', null);
	$scheme_key = strtolower($scheme);
	if ($port !== null && ! (isset($default_ports[$scheme_key]) && $default_ports[$scheme_key] === (int) $port)) {
		$url .= ':' . $port;
	}

	$url .= $part($target, 'path');
	if ($part($target, 'query') !== '') {
		$url .= '?' . $target['query'];
	}
	if ($part($target, 'fragment') !== '') {
		$url .= '#' . $target['fragment'];
	}
	return $url;
}
	
/**
 * Send a string to the browser as a file download and stop the script.
 *
 * @param string $file_name    Name offered to the browser; may contain non-ASCII characters.
 * @param string $file_content Body of the download.
 * @param string $content_type MIME type placed in the Content-Type header.
 * @return void This function never returns; it calls exit after sending the body.
 */
function file_export($file_name, $file_content, $content_type='text/doc') {
	// "utf-8" is the registered charset label; "utf8" is not.
	header(sprintf('Content-Type: %s; charset=utf-8', $content_type));

	// Plain `filename=` must be a quoted ASCII string, so non-ASCII bytes,
	// quotes and backslashes are replaced there; the real (UTF-8) name is
	// carried by the percent-encoded `filename*` parameter.
	$ascii_fallback = preg_replace('/[^\x20-\x7E]|["\\\\]/', '_', (string) $file_name);
	header(sprintf(
		"Content-Disposition: attachment; filename=\"%s\"; filename*=UTF-8''%s",
		$ascii_fallback,
		rawurlencode((string) $file_name)
	));
	echo $file_content;
	exit;
}

// Bootstrap the two libraries this script needs (CurlMulti and phpQuery).
// Each one is downloaded from http://curlmulti.com/index/download/<name> on
// first use, stored next to the working directory and then loaded.
//
// NOTE(review): this fetches executable PHP over plain HTTP and runs it
// without any integrity check. Prefer shipping pinned copies of both files
// with the project instead of downloading them at run time.
foreach (array('CurlMulti', 'phpQuery') as $dependency_name) {
	$dependency_file = './' . $dependency_name . '.php';
	if ( ! is_file($dependency_file)) {
		$dependency_source = file_get_contents('http://curlmulti.com/index/download/' . $dependency_name);
		// A failed download used to be written out as an empty file, which was
		// then treated as "already installed" on every later run.
		if ($dependency_source === false || trim($dependency_source) === '') {
			throw new RuntimeException(sprintf('Unable to download dependency "%s".', $dependency_name));
		}
		if (file_put_contents($dependency_file, $dependency_source, LOCK_EX) === false) {
			throw new RuntimeException(sprintf('Unable to write dependency file "%s".', $dependency_file));
		}
	}
	require $dependency_name . '.php';
}

/**
 * Minimal stopwatch: records a start and an end timestamp in static
 * properties and reports the difference in seconds.
 *
 * All methods are static because the rest of the file invokes them as
 * myDebug::set_start() / myDebug::set_end() / myDebug::report(); calling a
 * non-static method that way is an Error on PHP 8. Calling them on an
 * instance keeps working.
 */
class myDebug {
	/** @var float|null Timestamp stored by set_start(). */
	static $start;
	/** @var float|null Timestamp stored by set_end(). */
	static $end;
	/** @var mixed Not used by this class. */
	static $times;

	/**
	 * Current Unix timestamp with microsecond precision.
	 *
	 * @return float Seconds since the Unix epoch.
	 */
	static function microtime_float(){
		return microtime(true);
	}

	/** Store the current time as the start of the measured interval. */
	public static function set_start() {
		self::$start = self::microtime_float();
	}

	/** Store the current time as the end of the measured interval. */
	public static function set_end() {
		self::$end = self::microtime_float();
	}

	/**
	 * Elapsed time between set_start() and set_end().
	 *
	 * @return float|int Seconds; 0 when neither timestamp has been set.
	 */
	public static function report() {
		return self::$end - self::$start;
	}
}
/**
 * Crawler that walks a paginated list page and collects the links it finds.
 *
 * Pages are fetched through CurlMulti and parsed with phpQuery. The CSS
 * selectors, start URL and cache location come from the injected `request`
 * object. Collected links are kept in $article_list and written to
 * "<cache_path>data/list.php" and "<cache_path>data/list.txt".
 */
class myCurl {

	/** @var CurlMulti Downloader instance configured in _init_curl(). */
	public $curl;
	/** @var array Collected items, keyed by md5(href); each item has "title" and "href". */
	public $article_list = array();
	/** @var string Directory handed to CurlMulti as its cache directory. */
	protected $cacheDir;
	/** @var string Directory the list.php / list.txt result files are written to. */
	protected $cacheDataDir;
	/** @var int Number of list pages processed by _success_list(). */
	protected $pageCount = 0;
	/** @var int Number of article pages processed by _success_article(). */
	protected $articleCount = 0;
	/** @var request Crawl configuration. */
	protected $request;


	/**
	 * @param request $request Crawl configuration (URL, selectors, cache path).
	 * @throws RuntimeException When a cache directory cannot be created.
	 */
	public function __construct(request $request){
		$this->_init_var();
		$this->_init_request($request);
		$this->_init_curl();
	}

	/** Reset the counters and the collected list to their empty state. */
	protected function _init_var() {
		$this->pageCount = 0;
		$this->articleCount = 0;
		$this->article_list = array();
	}

	/** Remember the crawl configuration. */
	protected function _init_request(request $request) {
		$this->request = $request;
	}

	/**
	 * Create the cache directories and configure the CurlMulti instance.
	 *
	 * @throws RuntimeException When a cache directory cannot be created.
	 */
	protected function _init_curl() {
		$this->curl = new CurlMulti();
		$this->cacheDir = $this->request->cache_path . 'cache';
		$this->cacheDataDir = $this->request->cache_path . 'data';
		$this->_ensure_dir($this->cacheDir);
		$this->_ensure_dir($this->cacheDataDir);
		$this->curl->cache = array(
			'dir' => $this->cacheDir,
			'on' => true,
			'expire' => 3600 * 24
		);
		$this->curl->maxThread = 10;
		$this->curl->opt[CURLOPT_CONNECTTIMEOUT] = 10;
	}

	/**
	 * Create $dir (recursively) when it does not exist yet.
	 *
	 * @param string $dir Directory path.
	 * @throws RuntimeException When the directory cannot be created.
	 */
	protected function _ensure_dir($dir) {
		// The mode must be the octal literal 0777; decimal 777 is octal 01411.
		// The second is_dir() covers another process creating it concurrently.
		if ( ! is_dir($dir) && ! mkdir($dir, 0777, true) && ! is_dir($dir)) {
			throw new RuntimeException(sprintf('Unable to create directory "%s".', $dir));
		}
	}

	/** Crawl the list pages starting at the configured URL, then save the result files. */
	public function fetch_list(){
		$this->_add_fetch_list_url();
		$this->curl->start();
		$this->_save_article_list();
	}

	/** Fetch every collected link; each finished download is counted by _success_article(). */
	public function fetch_article() {
		foreach ($this->article_list as $item) {
			$this->curl->add(array(
				'url' => $item['href']
			), array($this, '_success_article'));
		}
		$this->curl->start();
	}

	/** Print the summary produced by fetch(). */
	public function display() {
		echo $this->fetch();
	}

	/**
	 * Build the human-readable crawl summary.
	 *
	 * @return string Summary with the page counters, the number of collected
	 *                links and the location of list.php.
	 */
	public function fetch() {
		return sprintf(
			"\n共抓取%d个页面\n文章列表%d篇\n相关文章%d篇\n文章目录存放在%s\n",
			$this->pageCount + $this->articleCount,
			$this->pageCount,
			count($this->article_list),
			$this->cacheDataDir . '/list.php'
		);
	}

	/**
	 * Queue a list page for download.
	 *
	 * @param string $url Page to fetch; the configured base URL when empty.
	 */
	public function _add_fetch_list_url($url=''){
		$this->curl->add(
			array(
				'url' => $url ?: $this->request->base_url,
			),
			array($this, '_success_list')
		);
	}

	/**
	 * Write the collected list as a PHP array (list.php) and as text (list.txt).
	 *
	 * @return int|false Bytes written to list.txt, or false when either write failed.
	 */
	protected function _save_article_list() {
		$php_written = file_put_contents(
			$this->cacheDataDir . '/list.php',
			sprintf("<?php\n return\t%s;",
			var_export($this->article_list, true))
		);
		// file_put_contents() accepts an array and writes its elements back to back.
		$txt_written = file_put_contents(
			$this->cacheDataDir . '/list.txt',
			array_map(function($a_list){
				return sprintf(
					"标题：%s\t超链接:%s \n",
					str_replace(" ", "", $a_list['title']),
					$a_list['href']
				);
			}, $this->article_list)
		);
		return ($php_written === false) ? false : $txt_written;
	}

	/** CurlMulti callback for a downloaded article page: only counts it. */
	public function _success_article($r, $param){
		++$this->articleCount;
	}

	/**
	 * CurlMulti callback for a downloaded list page.
	 *
	 * Extracts every element matching the item selector into $article_list
	 * and queues the page linked from the pagination element, if any.
	 *
	 * @param array $r     Download result; $r['content'] is parsed as HTML.
	 * @param mixed $param Unused.
	 */
	public function _success_list($r, $param){
		++$this->pageCount;
		$html = phpQuery::newDocumentHTML($r['content']);
		$list = $html[$this->request->fetch_item_query];
		foreach ($list as $v) {
			$v = pq($v);

			$item = array(
				"title" => $v->attr('title') ? $v->attr('title') : $v->text(),
				"href" => real_url($v->attr('href'), $this->request->base_url)
			);
			// Keyed by the URL hash so the same link is stored only once.
			$this->article_list[md5($item['href'])] = $item;
		}
		$page_current = $html[$this->request->fetch_page_current];
		if ($page_current->text() || $page_current->next()->text()) {
			$next_href = $page_current->attr("href");
			// Without an href the resolved URL would be the base URL itself,
			// which would queue the first page again.
			if ($next_href !== null && $next_href !== '') {
				$this->_add_fetch_list_url(
					real_url($next_href, $this->request->base_url)
				);
			}
		}

		phpQuery::unloadDocuments();
	}
}
/**
 * Crawl configuration read from the submitted form ($_POST).
 *
 * Singleton: obtain the instance with request::getInstance().
 */
class request{
	/* Start URL of the crawl. */
	public $base_url;
	/* Directory (with trailing slash) below this script's directory used for cache files. */
	public $cache_path;
	/* CSS selector of the elements to collect. */
	public $fetch_item_query;
	/* CSS selector of the pagination element. */
	public $fetch_page_current;

	static $instance;

	/**
	 * @return request The shared instance, created on first use.
	 */
	static public function getInstance() {
		if (empty(self::$instance)) {
			self::$instance = new self;
		}

		return self::$instance;
	}

	private function __construct() {
		$this->_init_base();
	}

	/**
	 * Read a string field from $_POST.
	 *
	 * @param string $name Field name.
	 * @return string The submitted value, or '' when missing or not a scalar.
	 */
	private static function _post($name) {
		return (isset($_POST[$name]) && is_scalar($_POST[$name])) ? (string) $_POST[$name] : '';
	}

	/** Populate the public properties from $_POST. */
	function _init_base() {
		// cache_path is user input: drop empty, "." and ".." segments so the
		// resulting directory always stays below this script's directory.
		$segments = preg_split('#[/\\\\]+#', self::_post('cache_path'), -1, PREG_SPLIT_NO_EMPTY);
		$safe_segments = array();
		foreach ($segments as $segment) {
			if ($segment !== '.' && $segment !== '..') {
				$safe_segments[] = $segment;
			}
		}
		$this->cache_path = __DIR__ . '/'. implode('/', $safe_segments) .'/';
		$this->fetch_item_query = self::_post('fetch_item_query');
		$this->fetch_page_current = self::_post('fetch_page_current');
		$this->base_url = self::_post('url');
	}

	/**
	 * Build the request URL and POST payload from the submitted "param" rows.
	 *
	 * Each row of $_POST['param'] is expected to carry "method" ("get" or
	 * "post"), "name" and "value"; rows without a method or name are skipped.
	 * GET parameters are URL-encoded and appended to the URL.
	 *
	 * @return array Keys: "url" (string), "param" (rows grouped by method),
	 *               "post_data" (array of POST parameters, or '' when none).
	 */
	function request() {
		// NOTE(review): nothing in this file provides an auth token, so the
		// "auth" parameter is emitted empty, exactly as the previous code did
		// through an undefined variable. Confirm where it should come from.
		$auth = '';
		$base = self::_post('url');
		$separator = (strpos($base, '?') !== false) ? '&' : '?';
		$url = sprintf("%s%sauth=%s", $base, $separator, $auth);

		$param = array();
		if (isset($_POST['param']) && is_array($_POST['param'])) {
			foreach($_POST['param'] as $item) {
				if (is_array($item) && !empty($item['method']) && !empty($item['name'])) {
					$param[$item['method']][$item['name']] = isset($item['value']) ? $item['value'] : '';
				}
			}
		}
		if (!empty($param['get'])) {
			foreach ($param['get'] as $name => $value) {
				$url = sprintf("%s&%s=%s", $url, urlencode((string) $name), urlencode((string) $value));
			}
		}
		$post_data = '';
		if (!empty($param['post'])) {
			$post_data = $param['post'];
		}
		return array(
			'url' => $url,
			'param' => $param,
			'post_data' => $post_data,
		);
	}
}
?>
<?php
// Request dispatch. Exactly one branch runs, chosen by which button (if any)
// was submitted; the HTML template below reads $_POST, $request and $myCurl.
if (isset($_POST['submit'])) {
	// "Start crawling": time the list crawl. Fetching the individual articles
	// (fetch_article) is left disabled.
	$request = request::getInstance();
	$myCurl = new myCurl($request);
	myDebug::set_start();
	$myCurl->fetch_list();
	myDebug::set_end();
} elseif (isset($_POST['export'])) {
	// "Export configuration": send the submitted fields (minus the button
	// itself) back as a download. file_export() ends the script.
	unset($_POST['export']);
	file_export('抓取配置文件.conf', var_export($_POST, true));
} elseif (isset($_POST['import'])) {
	// "Import configuration": not implemented; dumps the request and stops.
	var_dump($_POST);
	exit;
} else {
	// First visit: pre-fill the form with an example configuration.
	$default_fields = array(
		'url' => 'http://www.oschina.net/code/tag/php?show=time&lang=&catalog=&p=1',
		'cache_path' => 'oschina',
		'fetch_item_query' => '.code_list ul li .code_title > a',
		'fetch_page_current' => '.pager li.next > a',
	);
	foreach ($default_fields as $field_name => $field_value) {
		$_POST[$field_name] = $field_value;
	}
}



?>
<html lang="zh-CN">	
<head>
	<meta charset="utf-8">
	<title>页面爬虫</title>
	<link href="http://cdn.bootcss.com/bootstrap/3.2.0/css/bootstrap.min.css" rel="stylesheet">
	<link href="http://cdn.bootcss.com/font-awesome/4.1.0/css/font-awesome.min.css" rel="stylesheet">
	<link href="http://static.bootcss.com/www/assets/css/site.min.css?v3" rel="stylesheet">
	<link href="http://static.bootcss.com/www/assets/ico/favicon.png" rel="shortcut icon">
	<script src="http://cdn.bootcss.com/jquery/1.11.1/jquery.min.js"></script>
</head>
<body>
<div class="container">
		<div class="row row-offcanvas row-offcanvas-right">
			<div class="col-sm-12">
				<div class="row" >
					<div class="col-lg-4">
						<h1>页面爬虫</h1>
						<div class="thumbnail">
						<form class="form-signin" action="" method="post">
							<?php
								// Everything echoed into the form comes from $_POST, so it is
								// HTML-escaped before being placed inside an attribute.
								$h = function ($value) {
									return htmlspecialchars(is_scalar($value) ? (string) $value : '', ENT_QUOTES, 'UTF-8');
								};
								// Escaped value of a submitted field; '' when the field itself is absent.
								$posted = function ($name) use ($h) {
									return isset($_POST[$name]) ? $h($_POST[$name]) : '';
								};
							?>
							<b>请填URL</b>:
							<input value="<?php echo $posted('url');?>" class="form-control" placeholder="填写完整地址，以http://开头" type="text" name="url" required><br>
							<b>请填缓存文件路径</b>:
							<input value="<?php echo $posted('cache_path');?>" class="form-control" placeholder="填写缓存文件路径" type="text" name="cache_path" required><br>
							<b>请填获取元素的CSS选择器</b>:
							<input value="<?php echo $posted('fetch_item_query');?>" class="form-control" placeholder="填写获取元素的CSS选择器" type="text" name="fetch_item_query" required><br>
							<b>请填分页当前页面元素的CSS选择器</b>:
							<input value="<?php echo $posted('fetch_page_current');?>" class="form-control" placeholder="填写分页当前页面元素的CSS选择器" type="text" name="fetch_page_current" required><br>
							<?php if (isset($_POST['param']) && is_array($_POST['param'])) :?>
								<?php foreach ($_POST['param'] as $k => $item) :?>
									<?php if (is_array($item) && !empty($item['method']) && !empty($item['name'])) :?>
										<div class="thumbnail">
											<b>参数name</b>:
											<input value="<?php echo $h($item['name']);?>" placeholder="请填写" type="text" name="param[<?php echo $h($k);?>][name]"><br>
											<b>参数value</b>:
											<input value="<?php echo $h(isset($item['value']) ? $item['value'] : '');?>" placeholder="请填写" type="text" name="param[<?php echo $h($k);?>][value]"><br>
											<b>请求方式</b>:
											<label><input <?php if($item['method']=='get'):?>checked<?php endif;?> value="get" type="radio" name="param[<?php echo $h($k);?>][method]">get</label>
											<label><input <?php if($item['method']=='post'):?>checked<?php endif;?> value="post" type="radio" name="param[<?php echo $h($k);?>][method]">post</label><br />
											<a href="#" onclick="del_param(this)">删除</a>
										</div>
									<?php endif;?>
								<?php endforeach;?>
							<?php endif;?>
							
							<input type="button" name="add_param" id="add_param" value="添加参数" class="btn btn-lg btn-success btn-block"><br />
							<input type="submit" name="submit" value="开始获取远程数据" class="col-lg-100 btn btn-lg btn-block btn-primary"><br />
							<input type="submit" name="export" value="导出配置" class="btn btn-lg btn-warning btn-block"><br />
						</form>
						<form class="form-signin" action="" id="from_import" method="post">
							<input type="button" name="import" id="import" value="导入配置" class="btn btn-lg btn-warning btn-block"><br />
							<input type="file" name="import_file" class="glyphicon glyphicon-file">
						</form>
						</div>
					</div>
					<div class="col-lg-8">
						<?php
							// Result panel, shown only after a crawl was submitted.
							if (isset($_POST['submit'])) {
								// var_dump() prints its argument verbatim, and the dumped values
								// contain user input (URL, cache path). Capture the dump and
								// HTML-escape it before it reaches the page.
								// NOTE(review): if an extension replaces var_dump() with an
								// HTML-producing version, that markup will be shown escaped.
								$dump_escaped = function ($value) {
									ob_start();
									var_dump($value);
									echo htmlspecialchars(ob_get_clean(), ENT_QUOTES, 'UTF-8');
								};

								echo "<pre>";
								echo "请求时间:";
								$dump_escaped(myDebug::report());

								echo "<br />请求url:";
								if (isset($request->base_url)) {
									$dump_escaped($request->base_url);
								}

								echo "<br />请求参数:";
								// $param is only shown when some earlier code defined it.
								if (isset($param)) {
									$dump_escaped($param);
								}

								echo "<hr />结果：";
								$dump_escaped($myCurl->fetch());

								echo "</pre>";
							}
						?>
					</div>
				</div>
			</div>
		</div>
		<hr />
	</div>
	<div class="blog-masthead">
		<div class="container">
			<nav class="blog-nav">
				<p class="blog-nav-item">&copy; Company 2014</p>
			</nav>
		</div>
	</div>
</body>
</html>

<script>
	// Click handler for the "import configuration" button.
	// The handler body is still a placeholder: nothing happens on click.
	$("#import").on("click", function () {
		// File upload (not implemented).
	});
	// "Add parameter": insert a new name/value/method group above the button.
	$("#add_param").click(function(){
		// Use one more than the highest param[N] index currently in the form.
		// Deriving the index from the number of inputs reused an existing index
		// after a group had been deleted, which merged two radio groups and made
		// one group overwrite the other on submit.
		var next_index = 0;
		$('form input[name^="param["]').each(function () {
			var match = /^param\[(\d+)\]/.exec(this.name);
			if (match) {
				next_index = Math.max(next_index, parseInt(match[1], 10) + 1);
			}
		});
		$(this).before('\
			<div class="thumbnail">\
				<b>参数name</b>:\
				<input value="" placeholder="请填写" type="text" name="param['+ next_index +'][name]"><br>\
				<b>参数value</b>:\
				<input value="" placeholder="请填写" type="text" name="param['+ next_index +'][value]"><br>\
				<b>请求方式</b>:\
				<label><input checked value="get" type="radio" name="param['+ next_index +'][method]">get</label>\
				<label><input value="post" type="radio" name="param['+ next_index +'][method]">post</label><br />\
				<a href="#" onclick="del_param(this)">删除</a>\
			</div>\
		');	
	});
	// Remove the parameter group that contains the clicked "delete" link.
	// `obj` is the link element; its direct parent is the group wrapper.
	function del_param(obj) {
		var $group = $(obj).parent();
		$group.remove();
	}
</script>
